/*-------------------------<-- Start of Description -->-------------------------*\ | PURPOSE: This macro is designed to check for exact duplicate and/or near | | duplicate observations in a SAS dataset. It generates a | | printout of near duplicates for further manual examination and | | notes in the log changes made to the output dataset. | |--------------------------<-- End of Description -->----------------------------| |--------------------------------------------------------------------------------| |------------------------<-- Start of Files Created -->--------------------------| | SYNTAX: %doubles (indata = _input_dataset_, | | outdata = _output_dataset_, | | var = _unique_var_list_, | | nodup = _yes_or_no_) | \-------------------------<-- End of Files Created-->---------------------------*/ %macro doubles (indata=, outdata=, var=, nodup=yes) /des='doubles identifier' ; /*-------------------------------------------------------------\ | turn notes off, begin time stamp, globalize and localize all | | variables, calculate key macro variables. | \-------------------------------------------------------------*/ %let notes=%sysfunc(getoption(notes,keyword)); options nonotes; %let starttime = %sysfunc(datetime()); %local dataset var varnum sortvar varcnt orig nodup del near notes outdata starttime i x; %let varnum = %words (&var); %let sortvar = %scan(&var, &varnum); %if %upcase(%substr(&nodup, 1, 1)) = Y %then %let nodup = nodup noequals; %else %let nodup = %str(); /*--------------------------------------------------\ | verify that variable named X is not in input data | | if so, find another variable name! | +---------------------------------------------------+-------------------+ | in THEORY, this will not work all the time... but someone has to have | | basically be using variales named x0-xINFINITY before it will fail. | | If you have that many variables in your dataset, this utility will | | not be practical and you should code your own-- or change KEYVAR = X | | to KEYVAR = Y | \----------------------------------------------------------------------*/ proc contents data=&indata noprint out=contents (keep = name) ; run ; %let x = x0 ; %do i = 1 %to %eval(%nobs(&indata)+1) ; data _null_ ; set contents nobs=nobs ; /*-------------------------------------------------------\ | if we encounter the variable name, then increment by 1 | | and kick out of the data step, going through the loop | | again, searching for a safe variable name | \-------------------------------------------------------*/ if upcase(name) = "%upcase(&x)" then do ; call symput ('x', 'x'||trim(left(put(_n_, best.)))) ; stop ; end ; /*----------------------------------------------------\ | if the variable name is safe we can kick out of the | | macro DO loop | \----------------------------------------------------*/ else if _n_ = nobs then call symput ('i', %nobs(&indata) + 1) ; run ; %end ; /*---------------------------------------------------------\ | sort dataset by &var, remove exact duplicates if desired | \---------------------------------------------------------*/ proc sort data=&indata &nodup out=&outdata (label = "DOUBLES macro output for %data(&indata)"); by &var; run; %if %nobs(&indata) ne %nobs(&outdata) %then %do ; %let delete = %eval(%nobs(&indata) - %nobs(&outdata)) ; options ¬es ; %put NOTE: &delete duplicate observations were deleted. ; options nonotes ; %end ; /*--------------------------------------------------------\ | create first and second temporary datasets for use with | | proc compare. | \--------------------------------------------------------*/ data temp1 temp2; set &outdata; by &var; if first.&sortvar then output temp1; else output temp2; run; proc compare base=temp1 compare=temp2 noprint out=a outnoequal; by &var; run; /*-----------------------------------------\ | add a marking variable to key dataset t1 | \-----------------------------------------*/ data t1(keep=&var &x) /view=t1; set a; &x = '|'; run; /*-----------------------------------------\ | use key duplicate dataset to check temp1 | \-----------------------------------------*/ data checka /view=checka; merge t1 temp1; by &var; run; /*-----------------------------------------\ | use key duplicate dataset to check temp2 | \-----------------------------------------*/ data checkb /view=checkb; merge t1 temp2; by &var; run; /*---------------------------------------------------------\ | merge data together and keep if data is a near duplicate | \---------------------------------------------------------*/ data check; set checka checkb; by &var; if &x = 'x'; run; /*-------------------------\ | print the duplicate data | \-------------------------*/ proc print data=check; run; /*--------------------------\ | delete temporary datasets | \--------------------------*/ proc datasets library=macro memtype=(data view) nolist; delete a temp1 temp2 t1 checka checkb check; run;quit; /*-----------------------------------\ | Prepare sas session for open code. | \-----------------------------------*/ options ¬es; %datanote (&outdata) options nonotes; %timenote (macro=doubles, starttime=&starttime); %mend doubles ;